library(dplyr)
library(minfi)
library(IlluminaHumanMethylation450kmanifest)
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)cohort <- "GASPARONI"
data.dir <- file.path("../DATASETS",cohort)
data.dir.table <- "../DATASETS/Summary_Table/"
data.dir.raw <- "../../coMethDMR_metaAnalysis/code_validation/Meta_analysis_code/DATASETS/GASPARONI/step2_read_minfi/"
data.dir.bsfilter <- file.path(data.dir,"step2_bsConvFilter/")
data.dir.clinical.filter <- file.path(data.dir,"step3_clinical_available_filtering/")
data.dir.probes.qc <- file.path(data.dir,"step4_probesQC_filtering/")
data.dir.probes.normalization <- file.path(data.dir,"step5_normalization/")
data.dir.pca <- file.path(data.dir,"step6_pca_filtering/")
data.dir.neuron <- file.path(data.dir,"step7_neuron_comp/")
data.dir.single.cpg.pval <- file.path(data.dir,"step8_single_cpg_pval/")
data.dir.residuals <- file.path(data.dir,"step9_residuals/")
data.dir.median <- file.path(data.dir,"step10_median/")
for(p in grep("dir",ls(),value = T)) dir.create(get(p),recursive = TRUE,showWarnings = FALSE)# Create a MethylSet object from RGSet
MSet <- preprocessRaw(RGSet)
# Create [Genomic]MethylSet object
GMset <- mapToGenome(MSet)
# Get predicted sex status
estSex <- getSex(GMset)
# Compare predicted gender with phenotype gender
realSex <- data.frame(
sample = paste(phenoData$geo_accession,
phenoData$sentrix_id.ch1,
phenoData$sentrix_position.ch1,
sep = "_"),
realSex = phenoData$Sex.ch1,
stringsAsFactors = FALSE
)
compareSex <- merge(
as.data.frame(estSex), realSex,
by.x = "row.names",
by.y = "sample")
identical(compareSex$predictedSex, compareSex$realSex)Removing samples with bisulfiteConversion lower than 88.
phenoData <- phenoData[match(substr(colnames(RGSet),1,10), phenoData$geo_accession),]
nb.samples <- nrow(phenoData)
nb.female.samples <- sum(phenoData$Sex.ch1 == "F")
nb.male.samples <- sum(phenoData$Sex.ch1 == "M")bs <- data.frame(bisulfiteConversion = bscon(RGSet))
bsFilteredOut <- row.names(bs)[bs$bisulfiteConversion < 88]
RGSet <- RGSet[,!colnames(RGSet) %in% bsFilteredOut]
phenoData <- phenoData[match(substr(colnames(RGSet),1,10), phenoData$geo_accession),]
nb.samples.bc.filtered <- nrow(phenoData)
nb.female.samples.bc.filtered <- sum(phenoData$Sex.ch1 == "F")
nb.male.samples.bc.filtered <- sum(phenoData$Sex.ch1 == "M")phenoData$braak_stage.ch1 <- phenoData$braak_stage.ch1 %>% as.numeric()
phenoData$age.ch1 <- phenoData$age.ch1 %>% as.numeric()
### Subset rows and columns
pheno_df <- phenoData %>% as.data.frame() %>%
dplyr::filter(
source_name_ch1 == "Frontal Cortex" &
!is.na(phenoData$braak_stage.ch1) &
phenoData$characteristics_ch1 == "cell type: bulk"
) %>% dplyr::select(
c(
"geo_accession",
"donor_id.ch1",
"sentrix_id.ch1",
"age.ch1",
"Sex.ch1",
"braak_stage.ch1"
)
)
### Rename vars
colnames(pheno_df) <- c(
"sample", "subject.id", "slide", "age.brain", "sex", "stage"
)
nb.samples.with.clinical <- nrow(pheno_df)
nb.female.samples.with.clinical <- sum(pheno_df$sex == "F")
nb.male.samples.with.clinical <- sum(pheno_df$sex == "M")Input:
Output:
### Find which chromosome each probe is on
beta_mat <- beta_mat <- getBeta(RGSet)
probes.info <- sesameDataGet("HM450.hg19.manifest")
probes.info <- probes.info[row.names(beta_mat) %>% as.character()] %>%
as.data.frame %>%
dplyr::select(c("seqnames","start","end"))
probes.info$seqnames <- as.character(probes.info$seqnames)
nb.probes <- nrow(probes.info)
nb.chrAuto.probes <- sum(probes.info$seqnames %in% paste0("chr", 1:22))
nb.chrX.probes <- sum(probes.info$seqnames == "chrX")
nb.chrY.probes <- sum(probes.info$seqnames == "chrY")
nb.chrM.probes <- sum(probes.info$seqnames == "chrM")### subset to probes with detection P <= 0.01
detP <- detectionP(RGSet, type = "m+u")
failed.01 <- detP > 0.01
passedProbes <- rownames(failed.01)[rowMeans(failed.01) == 0]
beta_mat <- beta_mat[passedProbes, ]
probes.info <- probes.info[row.names(probes.info) %in% row.names(beta_mat),]
nb.probes.detectP <- nrow(probes.info)
nb.chrAuto.probes.detectP <- sum(probes.info$seqnames %in% paste0("chr", 1:22))
nb.chrX.probes.detectP <- sum(probes.info$seqnames == "chrX")
nb.chrY.probes.detectP <- sum(probes.info$seqnames == "chrY")
nb.chrM.probes.detectP <- sum(probes.info$seqnames == "chrM")
### keep only probes that start with "cg"
beta_mat <- beta_mat[grep("cg",rownames(beta_mat)),]
probes.info <- probes.info[row.names(probes.info) %in% row.names(beta_mat),]
nb.probes.detectP.cg <- nrow(probes.info)
nb.chrAuto.probes.detectP.cg <- sum(probes.info$seqnames %in% paste0("chr", 1:22))
nb.chrX.probes.detectP.cg <- sum(probes.info$seqnames == "chrX")
nb.chrY.probes.detectP.cg <- sum(probes.info$seqnames == "chrY")
nb.chrM.probes.detectP.cg <- sum(probes.info$seqnames == "chrM")### drop probes where SNP with MAF >= 0.01 in the last 5 bp of the probe
beta_mat <- rmSNPandCH(
object = beta_mat,
dist = 5,
mafcut = 0.01,
and = TRUE,
rmcrosshyb = FALSE,
rmXY = FALSE
)
probes.info <- probes.info[row.names(probes.info) %in% row.names(beta_mat),]
nb.probes.cg.dmrcate <- nrow(probes.info)
nb.chrAuto.probes.cg.dmrcate <- sum(probes.info$seqnames %in% paste0("chr", 1:22))
nb.chrX.probes.cg.dmrcate <- sum(probes.info$seqnames == "chrX")
nb.chrY.probes.cg.dmrcate <- sum(probes.info$seqnames == "chrY")
nb.chrM.probes.cg.dmrcate <- sum(probes.info$seqnames == "chrM")
### drop probes in chrM
probes.info <- probes.info[probes.info$seqnames != "chrM",]
beta_mat <- beta_mat[
row.names(beta_mat) %in% row.names(probes.info),
]
nb.probes.dmrcate.chrM <- nrow(probes.info)
nb.chrAuto.probes.dmrcate.chrM <- sum(probes.info$seqnames %in% paste0("chr", 1:22))
nb.chrX.probes.dmrcate.chrM <- sum(probes.info$seqnames == "chrX")
nb.chrY.probes.dmrcate.chrM <- sum(probes.info$seqnames == "chrY")Input:
Output:
probes.info <- sesameDataGet("HM450.hg19.manifest")
probes.info <- probes.info[row.names(beta_mat) %>% as.character()] %>%
as.data.frame %>%
dplyr::select(c("seqnames","start","end"))
probes.info$seqnames <- as.character(probes.info$seqnames)
findBetaChr <- function(data, chrom){
data %>%
as.data.frame() %>% # has to turn matrix into df for next step
tibble::rownames_to_column() %>% # turn rownames to a column, so rownames won't be deleted after filtering rows in next step
filter(rowname %in% row.names(probes.info[probes.info$seqnames == chrom,])) %>%
tibble::column_to_rownames() %>%
as.matrix()
}betaChrX <- findBetaChr(data = beta_mat, chrom = "chrX")
betaChrX_long <- data.frame(
beta = as.vector(betaChrX),
sample = rep(substr(colnames(betaChrX), 1,10), each = nrow(betaChrX)),
stringsAsFactors = FALSE
)
betaChrX_long <- merge(
betaChrX_long, pheno_df[, c("sample", "sex")],
by = "sample",
sort = FALSE
)
ggplot(betaChrX_long,
aes(x = sample, y = beta, fill = sex)) +
stat_boxplot(geom ='errorbar', width = 1, linetype = 1) +
geom_boxplot(width = 1, alpha = 1, outlier.shape = 1, outlier.size = 2) +
scale_fill_grey(start=1, end=0.6) +
labs(x = "sex", y = "DNA methylation beta values",
title = "DNA methylation level by sex on chromosome X") +
theme_bw() +
theme(legend.position = "none", axis.text.x = element_blank()) +
facet_wrap(~sex)betaChrY <- findBetaChr(data = beta_mat, chrom = "chrY")
betaChrY_long <- data.frame(
beta = as.vector(betaChrY),
sample = rep(substr(colnames(betaChrY), 1,10), each = nrow(betaChrY)),
stringsAsFactors = FALSE
)
betaChrY_long <- merge(
betaChrY_long, pheno_df[, c("sample", "sex")],
by = "sample",
sort = FALSE
)
ggplot(betaChrY_long,
aes(x = sample, y = beta, fill = sex)) +
stat_boxplot(geom ='errorbar', width = 1, linetype = 1) +
geom_boxplot(width = 1, alpha = 1, outlier.shape = 1, outlier.size = 2) +
scale_fill_grey(start=1, end=0.6) +
labs(x = "sex", y = "DNA methylation beta values",
title = "DNA methylation level by sex on chromosome Y") +
theme_bw() +
theme(legend.position = "none", axis.text.x = element_blank()) +
facet_wrap(~sex)chrAuto <- paste0("chr", 1:22)
betaChrAuto_ls <- lapply(seq_along(chrAuto), function(i){
dat <- findBetaChr(data = beta_mat, chrom = chrAuto[i])
data.frame(beta = as.vector(dat), chrom = "autosomes", stringsAsFactors = FALSE)
})
betaChrAuto_df <- do.call(rbind, betaChrAuto_ls)
chrSex <- paste0("chr", c("X", "Y"))
betaChrSex_ls <- lapply(seq_along(chrSex), function(i){
dat <- findBetaChr(data = beta_mat, chrom = chrSex[i])
data.frame(beta = as.vector(dat), chrom = chrSex[i], stringsAsFactors = FALSE)
})
betaChrSex_df <- do.call(rbind, betaChrSex_ls)
betaChr_df <- rbind(betaChrAuto_df, betaChrSex_df)
# # No enough memory to load all the data points and plot the figure, so use function boxplot instead
# ggplot(betaChr_df,
# aes(x = chrom, y = beta, fill = chrom)) +
# stat_boxplot(geom ='errorbar', width = 0.4, linetype = 1) +
# geom_boxplot(width = 0.4, alpha = 1, outlier.shape = 1, outlier.size = 2) +
# scale_fill_grey(start=1, end=0.6) +
# labs(x = "chromosome type", y = "DNA methylation beta values",
# title = "DNA methylation level vs. chromosome type") +
# theme_bw()
boxplot(
beta ~ chrom, data = betaChr_df, ylab = "DNA methylation beta values",
main = "DNA methylation on chromosomes")### split matrix by sex first
beta_mat_female <- beta_mat[
,substr(colnames(beta_mat), 1, 10) %in% pheno_df$sample[pheno_df$sex == "F"]]
beta_mat_male <- beta_mat[
,substr(colnames(beta_mat), 1, 10) %in% pheno_df$sample[pheno_df$sex == "M"]]
### for females
chrAuto <- paste0("chr", 1:22)
female_auto_ls <- lapply(seq_along(chrAuto), function(i){
dat <- findBetaChr(data = beta_mat_female, chrom = chrAuto[i])
data.frame(beta = as.vector(dat), chrom = "autosomes", stringsAsFactors = FALSE)
})
female_auto_df <- do.call(rbind, female_auto_ls)
female_sex_df <- findBetaChr(data = beta_mat_female, chrom = "chrX")
female_sex_df <- data.frame(beta = as.vector(female_sex_df), chrom = "chromosome X", stringsAsFactors = FALSE)
female_df <- rbind(female_auto_df, female_sex_df)
boxplot(
beta ~ chrom, data = female_df, ylab = "DNA methylation beta values",
main = "DNA methylation on chromosomes for females")### for males
chrAuto <- paste0("chr", 1:22)
male_auto_ls <- lapply(seq_along(chrAuto), function(i){
dat <- findBetaChr(data = beta_mat_male, chrom = chrAuto[i])
data.frame(beta = as.vector(dat), chrom = "autosomes", stringsAsFactors = FALSE)
})
male_auto_df <- do.call(rbind, male_auto_ls)
male_X_df <- findBetaChr(data = beta_mat_male, chrom = "chrX")
male_X_df <- data.frame(beta = as.vector(male_X_df), chrom = "chromosome X", stringsAsFactors = FALSE)
male_Y_df <- findBetaChr(data = beta_mat_male, chrom = "chrY")
male_Y_df <- data.frame(beta = as.vector(male_Y_df), chrom = "chromosome Y", stringsAsFactors = FALSE)
male_df <- rbind(male_auto_df, male_X_df, male_Y_df)
boxplot(
beta ~ chrom, data = male_df, ylab = "DNA methylation beta values",
main = "DNA methylation on chromosomes for males")chrAuto <- paste0("chr", 1:22)
betaChrAuto_ls <- lapply(seq_along(chrAuto), function(i){findBetaChr(data = beta_mat, chrom = chrAuto[i])})
betaChrAuto_df <- do.call(rbind, betaChrAuto_ls)
matboxplot(betaChrAuto_df,
groupFactor = pheno_df$sex,
xaxt = "n",
main = "Beta Values (autosomes) - before normalization")
legend('bottom',
paste0("sex ", levels(as.factor(pheno_df$sex))),
col = c(1:7), lty = 1, lwd = 3, cex = 0.70)betaChrX_df <- findBetaChr(data = beta_mat, chrom = "chrX")
matboxplot(betaChrX_df,
groupFactor = pheno_df$sex,
xaxt = "n",
main = "Beta Values (chromosome X) - before normalization")
legend('bottom',
paste0("sex ", levels(as.factor(pheno_df$sex))),
col = c(1:7), lty = 1, lwd = 3, cex = 0.70)betaChrY_df <- findBetaChr(data = beta_mat,chrom = "chrY")
matboxplot(betaChrY_df,
groupFactor = pheno_df$sex,
xaxt = "n",
main = "Beta Values (chromosome Y) - before normalization")
legend('bottom',
paste0("sex ", levels(as.factor(pheno_df$sex))),
col = c(1:7), lty = 1, lwd = 3, cex = 0.70)### split matrix by sex first
beta_mat_female <- beta_mat[
,substr(colnames(beta_mat), 1, 10) %in% pheno_df$sample[pheno_df$sex == "F"]]
beta_mat_male <- beta_mat[
,substr(colnames(beta_mat), 1, 10) %in% pheno_df$sample[pheno_df$sex == "M"]]
### split female beta matrix by chromosome (auto, X)
chrAuto <- paste0("chr", 1:22)
beta_mat_female_auto_ls <- lapply(seq_along(chrAuto), function(i){
findBetaChr(data = beta_mat_female, chrom = chrAuto[i])})
beta_mat_female_auto <- do.call(rbind, beta_mat_female_auto_ls)
beta_mat_female_X <- findBetaChr(data = beta_mat_female, chrom = "chrX")
### split male beta matrix by chromosome (auto, X, Y)
chrAuto <- paste0("chr", 1:22)
beta_mat_male_auto_ls <- lapply(seq_along(chrAuto), function(i){
findBetaChr(data = beta_mat_male, chrom = chrAuto[i])})
beta_mat_male_auto <- do.call(rbind, beta_mat_male_auto_ls)
beta_mat_male_X <- findBetaChr(data = beta_mat_male, chrom = "chrX")
beta_mat_male_Y <- findBetaChr(data = beta_mat_male, chrom = "chrY")### female autosomes
betaQN_female_auto <- lumiN(x.lumi = beta_mat_female_auto, method = "quantile")## Perform quantile normalization ...
## [1] 446784 30
## Perform quantile normalization ...
## [1] 10618 30
## Perform quantile normalization ...
## [1] 446784 27
## Perform quantile normalization ...
## [1] 10618 27
## Perform quantile normalization ...
## [1] 62 27
pheno_female_df <- pheno_df %>% filter(sex == "F")
matboxplot(betaQN_female_auto,
groupFactor = pheno_female_df$sex,
xaxt = "n",
main = "Beta Values (autosomes) - after normalization")
legend('bottom',
paste0("sex ", levels(as.factor(pheno_female_df$sex))),
col = c(1:7), lty = 1, lwd = 3, cex = 0.70)matboxplot(betaQN_female_X,
groupFactor = pheno_female_df$sex,
xaxt = "n",
main = "Beta Values (chromosome X) - after normalization")
legend('bottom',
paste0("sex ", levels(as.factor(pheno_female_df$sex))),
col = c(1:7), lty = 1, lwd = 3, cex = 0.70)pheno_male_df <- pheno_df %>% filter(sex == "M")
matboxplot(betaQN_male_auto,
groupFactor = pheno_male_df$sex,
xaxt = "n",
main = "Beta Values (autosomes) - after normalization")
legend('bottom',
paste0("sex ", levels(as.factor(pheno_male_df$sex))),
col = c(1:7), lty = 1, lwd = 3, cex = 0.70)matboxplot(betaQN_male_X,
groupFactor = pheno_male_df$sex,
xaxt = "n",
main = "Beta Values (chromosome X) - after normalization")
legend('bottom',
paste0("sex ", levels(as.factor(pheno_male_df$sex))),
col = c(1:7), lty = 1, lwd = 3, cex = 0.70)matboxplot(betaQN_male_Y,
groupFactor = pheno_male_df$sex,
xaxt = "n",
main = "Beta Values (chromosome Y) - after normalization")
legend('bottom',
paste0("sex ", levels(as.factor(pheno_male_df$sex))),
col = c(1:7), lty = 1, lwd = 3, cex = 0.70)### Order annotation in the same order as beta matrix
annotType <- sesameDataGet("HM450.hg19.manifest")
annotType$designTypeNumeric <- ifelse(annotType$designType == "I",1,2)
### Density plot for type I and type II probes
betaQNCompleteCol1_female_auto <- betaQN_female_auto[complete.cases(betaQN_female_auto[,1]), ]
annotTypeCompleteCol1_female_auto <- annotType[row.names(betaQNCompleteCol1_female_auto), ]
sm.density.compare(
betaQNCompleteCol1_female_auto[,1],
annotTypeCompleteCol1_female_auto$designTypeNumeric) +
title(main = "Density plot for type I and type II probes on female autosomes")## integer(0)
### Summary table
type12 <- annotType$designTypeNumeric[match(rownames(betaQN_female_auto),names(annotType))]
table(type12)## type12
## 1 2
## 125484 321300
set.seed(946)
doParallel::registerDoParallel(cores = 4)
betaQN_BMIQ_female_auto <- plyr::aaply(
betaQN_female_auto, 2,
function(x){
norm_ls <- BMIQ(x, design.v = type12, plots = FALSE)
return (norm_ls$nbeta)
},.progress = "time",.parallel = TRUE
) %>% t()
colnames(betaQN_BMIQ_female_auto) <- substr(colnames(betaQN_BMIQ_female_auto),1,stringr::str_length(pheno_df$sample) %>% unique)### Density plot for type I and type II probes
betaQNCompleteCol1_male_auto <- betaQN_male_auto[complete.cases(betaQN_male_auto[,1]), ]
annotTypeCompleteCol1_male_auto <- annotType[row.names(betaQNCompleteCol1_male_auto), ]
sm.density.compare(
betaQNCompleteCol1_male_auto[,1],
annotTypeCompleteCol1_male_auto$designTypeNumeric
)
title(main = "Density plot for type I and type II probes on male autosomes")### Summary table
type12 <- annotType$designTypeNumeric[match(rownames(betaQN_male_auto),names(annotType))]
table(type12)## type12
## 1 2
## 125484 321300
set.seed(946)
doParallel::registerDoParallel(cores = 4)
betaQN_BMIQ_male_auto <- plyr::aaply(
betaQN_male_auto, 2,
function(x){
norm_ls <- BMIQ(x, design.v = type12, plots = FALSE)
return (norm_ls$nbeta)
},.progress = "time",.parallel = TRUE
) %>% t()
colnames(betaQN_BMIQ_male_auto) <- substr(colnames(betaQN_BMIQ_male_auto),1,stringr::str_length(pheno_df$sample) %>% unique)### Density plot for type I and type II probes
betaQNCompleteCol1_female_X <- betaQN_female_X[complete.cases(betaQN_female_X[,1]), ]
annotTypeCompleteCol1_female_X <- annotType[row.names(betaQNCompleteCol1_female_X), ]
sm.density.compare(
betaQNCompleteCol1_female_X[,1],
annotTypeCompleteCol1_female_X$designTypeNumeric
)
title(main = "Density plot for type I and type II probes on female chromosome X")### Summary table
type12 <- annotType$designTypeNumeric[match(rownames(betaQN_female_X),names(annotType))]
table(type12)## type12
## 1 2
## 2916 7702
### Density plot for type I and type II probes
betaQNCompleteCol1_male_X <- betaQN_male_X[complete.cases(betaQN_male_X[,1]), ]
annotTypeCompleteCol1_male_X <- annotType[row.names(betaQNCompleteCol1_male_X), ]
sm.density.compare(
betaQNCompleteCol1_male_X[,1],
annotTypeCompleteCol1_male_X$designTypeNumeric
)
title(main = "Density plot for type I and type II probes on male chromosome X")### Summary table
type12 <- annotType$designTypeNumeric[match(rownames(betaQN_male_X),names(annotType))]
table(type12)## type12
## 1 2
## 2916 7702
### Density plot for type I and type II probes
betaQNCompleteCol1_male_Y <- betaQN_male_Y[complete.cases(betaQN_male_Y[,1]), ]
annotTypeCompleteCol1_male_Y <- annotType[row.names(betaQNCompleteCol1_male_Y), ]
sm.density.compare(
betaQNCompleteCol1_male_Y[,1],
annotTypeCompleteCol1_male_Y$designTypeNumeric
)
title(main = "Density plot for type I and type II probes on chromosome Y")### Summary table
type12 <- annotType$designTypeNumeric[match(rownames(betaQN_male_Y),names(annotType))]
table(type12)## type12
## 1 2
## 15 47
Input:
Output:
# plotPCA and OrderDataBySd functions
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")pheno_female_df <- pheno_df %>% filter(sex == "F")
pheno_female_df$stage3 <- ifelse(
pheno_female_df$stage %in% c(0,1,2), "0-2",
ifelse(pheno_female_df$stage %in% c(3,4), "3-4", "5-6")
)
betaQN_BMIQ_female <- betaQN_BMIQ_female[ , pheno_female_df$sample]
### transform to m values
mvalue_mat <- log2(betaQN_BMIQ_female / (1 - betaQN_BMIQ_female)) #dim: 457402 30
### order matrix by most variable probes on top
betaOrd_mat <- OrderDataBySd(betaQN_BMIQ_female) #dim: 457402 30
mOrd_mat <- OrderDataBySd(mvalue_mat) #dim: 457402 30
identical(pheno_female_df$sample, colnames(betaOrd_mat))## [1] TRUE
## [1] TRUE
pca <- prcomp(t(betaOrd_mat[1:50000, ]),
center = TRUE,
scale = TRUE)
d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2])
meanPC1 <- mean (d$PC1)
sdPC1 <- sd (d$PC1)
meanPC2 <- mean (d$PC2)
sdPC2 <- sd (d$PC2)
out3sdPC1_1 <- meanPC1 - 3 * sdPC1
out3sdPC1_2 <- meanPC1 + 3 * sdPC1
out3sdPC2_1 <- meanPC2 - 3 * sdPC2
out3sdPC2_2 <- meanPC2 + 3 * sdPC2
d$outlier_PC1[d$PC1 >= out3sdPC1_1 & d$PC1 <= out3sdPC1_2] <- 0
d$outlier_PC1[d$PC1 < out3sdPC1_1 | d$PC1 > out3sdPC1_2] <- 1
d$outlier_PC2[d$PC2 >= out3sdPC2_1 & d$PC2 <= out3sdPC2_2] <- 0
d$outlier_PC2[d$PC2 < out3sdPC2_1 | d$PC2 > out3sdPC2_2] <- 1### Beata values
byStage <- plotPCA(
dataset = "Gasparoni: beta values",
expSorted_mat = betaOrd_mat,
pheno = pheno_female_df,
group_char = "stage3",
ntop = 50000,
center = TRUE,
scale = TRUE
)### M values
byStage <- plotPCA(
dataset = "Gasparoni: M values",
expSorted_mat = mOrd_mat,
pheno = pheno_female_df,
group_char = "stage3",
ntop = 50000,
center = TRUE,
scale = TRUE
)pheno_male_df <- pheno_df %>% filter(sex == "M")
pheno_male_df$stage3 <- ifelse(
pheno_male_df$stage %in% c(0,1,2), "0-2",
ifelse(pheno_male_df$stage %in% c(3,4), "3-4", "5-6")
)
betaQN_BMIQ_male <- betaQN_BMIQ_male[ , pheno_male_df$sample]
### transform to m values
mvalue_mat <- log2(betaQN_BMIQ_male / (1 - betaQN_BMIQ_male)) #dim: 457464 27
### order matrix by most variable probes on top
betaOrd_mat <- OrderDataBySd(betaQN_BMIQ_male) #dim: 457464 27
mOrd_mat <- OrderDataBySd(mvalue_mat) #dim: 457464 27
identical(pheno_male_df$sample, colnames(betaOrd_mat))## [1] TRUE
## [1] TRUE
pca <- prcomp(t(betaOrd_mat[1:50000, ]),
center = TRUE,
scale = TRUE)
d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2])
meanPC1 <- mean (d$PC1)
sdPC1 <- sd (d$PC1)
meanPC2 <- mean (d$PC2)
sdPC2 <- sd (d$PC2)
out3sdPC1_1 <- meanPC1 - 3 * sdPC1
out3sdPC1_2 <- meanPC1 + 3 * sdPC1
out3sdPC2_1 <- meanPC2 - 3 * sdPC2
out3sdPC2_2 <- meanPC2 + 3 * sdPC2
d$outlier_PC1[d$PC1 >= out3sdPC1_1 & d$PC1 <= out3sdPC1_2] <- 0
d$outlier_PC1[d$PC1 < out3sdPC1_1 | d$PC1 > out3sdPC1_2] <- 1
d$outlier_PC2[d$PC2 >= out3sdPC2_1 & d$PC2 <= out3sdPC2_2] <- 0
d$outlier_PC2[d$PC2 < out3sdPC2_1 | d$PC2 > out3sdPC2_2] <- 1### Beata values
byStage <- plotPCA(
dataset = "Gasparoni: beta values",
expSorted_mat = betaOrd_mat,
pheno = pheno_male_df,
group_char = "stage3",
ntop = 50000,
center = TRUE,
scale = TRUE
)### M values
byStage <- plotPCA(
dataset = "Gasparoni: M values",
expSorted_mat = mOrd_mat,
pheno = pheno_male_df,
group_char = "stage3",
ntop = 50000,
center = TRUE,
scale = TRUE
)nb.samples.pca <- ncol(betaQN_BMIQ_PCfiltered_female) + ncol(betaQN_BMIQ_PCfiltered_male)
nb.female.samples.pca <- ncol(betaQN_BMIQ_PCfiltered_female)
nb.male.samples.pca <- ncol(betaQN_BMIQ_PCfiltered_male)
dim(betaQN_BMIQ_PCfiltered_female)## [1] 457402 29
## [1] 457464 27
## [1] 29 7
## [1] 27 7
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 3.6.2 (2019-12-12)
## os macOS Catalina 10.15.4
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2020-05-07
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date lib
## acepack 1.4.1 2016-10-29 [1]
## affy 1.64.0 2019-10-29 [1]
## affyio 1.56.0 2019-10-29 [1]
## annotate 1.64.0 2019-10-29 [1]
## AnnotationDbi * 1.48.0 2019-10-29 [1]
## AnnotationFilter 1.10.0 2019-10-29 [1]
## AnnotationHub * 2.18.0 2019-10-29 [1]
## askpass 1.1 2019-01-13 [1]
## assertthat 0.2.1 2019-03-21 [1]
## backports 1.1.6 2020-04-05 [1]
## base64 2.0 2016-05-10 [1]
## base64enc 0.1-3 2015-07-28 [1]
## beanplot 1.2 2014-09-19 [1]
## BiasedUrn 1.07 2015-12-28 [1]
## Biobase * 2.46.0 2019-10-29 [1]
## BiocFileCache * 1.10.2 2019-11-08 [1]
## BiocGenerics * 0.32.0 2019-10-29 [1]
## BiocManager 1.30.10 2019-11-16 [1]
## BiocParallel * 1.20.1 2019-12-21 [1]
## BiocVersion 3.10.1 2019-06-06 [1]
## biomaRt 2.42.1 2020-03-26 [1]
## Biostrings * 2.54.0 2019-10-29 [1]
## biovizBase 1.34.1 2019-12-04 [1]
## bit 1.1-15.2 2020-02-10 [1]
## bit64 0.9-7 2017-05-08 [1]
## bitops 1.0-6 2013-08-17 [1]
## blob 1.2.1 2020-01-20 [1]
## BSgenome 1.54.0 2019-10-29 [1]
## bsseq 1.22.0 2019-10-29 [1]
## bumphunter * 1.28.0 2019-10-29 [1]
## callr 3.4.3 2020-03-28 [1]
## cellranger 1.1.0 2016-07-27 [1]
## checkmate 2.0.0 2020-02-06 [1]
## cli 2.0.2 2020-02-28 [1]
## cluster * 2.1.0 2019-06-19 [1]
## codetools 0.2-16 2018-12-24 [1]
## colorspace 1.4-1 2019-03-18 [1]
## crayon 1.3.4 2017-09-16 [1]
## crosstalk 1.1.0.1 2020-03-13 [1]
## curl 4.3 2019-12-02 [1]
## data.table 1.12.8 2019-12-09 [1]
## DBI 1.1.0 2019-12-15 [1]
## dbplyr * 1.4.3 2020-04-19 [1]
## DelayedArray * 0.12.3 2020-04-09 [1]
## DelayedMatrixStats 1.8.0 2019-10-29 [1]
## desc 1.2.0 2018-05-01 [1]
## devtools 2.3.0 2020-04-10 [1]
## dichromat 2.0-0 2013-01-24 [1]
## digest 0.6.25 2020-02-23 [1]
## DMRcate * 2.0.7 2020-01-10 [1]
## DMRcatedata * 2.2.1 2020-02-27 [1]
## DNAcopy 1.60.0 2019-10-29 [1]
## doParallel 1.0.15 2019-08-02 [1]
## doRNG 1.8.2 2020-01-27 [1]
## dplyr * 0.8.5 2020-03-07 [1]
## DSS 2.34.0 2019-10-29 [1]
## DT 0.13 2020-03-23 [1]
## edgeR 3.28.1 2020-02-26 [1]
## ellipsis 0.3.0 2019-09-20 [1]
## ensembldb 2.10.2 2019-11-20 [1]
## evaluate 0.14 2019-05-28 [1]
## ExperimentHub * 1.12.0 2019-10-29 [1]
## fansi 0.4.1 2020-01-08 [1]
## farver 2.0.3 2020-01-16 [1]
## fastmap 1.0.1 2019-10-08 [1]
## FDb.InfiniumMethylation.hg19 * 2.2.0 2020-03-18 [1]
## foreach * 1.5.0 2020-03-30 [1]
## foreign 0.8-76 2020-03-03 [1]
## Formula 1.2-3 2018-05-03 [1]
## fs 1.4.1 2020-04-04 [1]
## genefilter 1.68.0 2019-10-29 [1]
## GenomeInfoDb * 1.22.1 2020-03-27 [1]
## GenomeInfoDbData 1.2.2 2020-03-18 [1]
## GenomicAlignments 1.22.1 2019-11-12 [1]
## GenomicFeatures * 1.38.2 2020-02-15 [1]
## GenomicRanges * 1.38.0 2019-10-29 [1]
## GEOquery 2.54.1 2019-11-18 [1]
## ggplot2 * 3.3.0 2020-03-05 [1]
## ggpubr 0.2.5 2020-02-13 [1]
## ggrepel * 0.8.2 2020-03-08 [1]
## ggsignif 0.6.0 2019-08-08 [1]
## glue 1.4.0 2020-04-03 [1]
## GO.db 3.10.0 2020-03-18 [1]
## gridExtra 2.3 2017-09-09 [1]
## gtable 0.3.0 2019-03-25 [1]
## gtools 3.8.2 2020-03-31 [1]
## Gviz 1.30.3 2020-02-17 [1]
## HDF5Array 1.14.4 2020-04-13 [1]
## Hmisc 4.4-0 2020-03-23 [1]
## hms 0.5.3 2020-01-08 [1]
## htmlTable 1.13.3 2019-12-04 [1]
## htmltools 0.4.0 2019-10-04 [1]
## htmlwidgets 1.5.1 2019-10-08 [1]
## httpuv 1.5.2 2019-09-11 [1]
## httr 1.4.1 2019-08-05 [1]
## IlluminaHumanMethylation450kanno.ilmn12.hg19 * 0.6.0 2020-03-18 [1]
## IlluminaHumanMethylation450kmanifest * 0.4.0 2020-03-18 [1]
## IlluminaHumanMethylationEPICanno.ilm10b4.hg19 0.6.0 2020-03-18 [1]
## IlluminaHumanMethylationEPICmanifest 0.3.0 2020-03-18 [1]
## illuminaio * 0.28.0 2019-10-29 [1]
## interactiveDisplayBase 1.24.0 2019-10-29 [1]
## IRanges * 2.20.2 2020-01-13 [1]
## iterators * 1.0.12 2019-07-26 [1]
## jpeg 0.1-8.1 2019-10-24 [1]
## jsonlite 1.6.1 2020-02-02 [1]
## KernSmooth 2.23-17 2020-04-26 [1]
## knitr 1.28 2020-02-06 [1]
## labeling 0.3 2014-08-23 [1]
## later 1.0.0 2019-10-04 [1]
## lattice 0.20-41 2020-04-02 [1]
## latticeExtra 0.6-29 2019-12-19 [1]
## lazyeval 0.2.2 2019-03-15 [1]
## lifecycle 0.2.0 2020-03-06 [1]
## limma * 3.42.2 2020-02-03 [1]
## locfit * 1.5-9.4 2020-03-25 [1]
## lumi * 2.38.0 2019-10-29 [1]
## magrittr 1.5 2014-11-22 [1]
## MASS 7.3-51.6 2020-04-26 [1]
## Matrix 1.2-18 2019-11-27 [1]
## matrixStats * 0.56.0 2020-03-13 [1]
## mclust 5.4.6 2020-04-11 [1]
## memoise 1.1.0 2017-04-21 [1]
## methylumi * 2.32.0 2019-10-29 [1]
## mgcv 1.8-31 2019-11-09 [1]
## mime 0.9 2020-02-04 [1]
## minfi * 1.32.0 2019-10-29 [1]
## missMethyl 1.20.4 2020-01-28 [1]
## multtest 2.42.0 2019-10-29 [1]
## munsell 0.5.0 2018-06-12 [1]
## nleqslv 3.3.2 2018-05-17 [1]
## nlme 3.1-147 2020-04-13 [1]
## nnet 7.3-14 2020-04-26 [1]
## nor1mix 1.3-0 2019-06-13 [1]
## openssl 1.4.1 2019-07-18 [1]
## org.Hs.eg.db * 3.10.0 2020-03-18 [1]
## permute 0.9-5 2019-03-12 [1]
## pillar 1.4.3 2019-12-20 [1]
## pkgbuild 1.0.7 2020-04-25 [1]
## pkgconfig 2.0.3 2019-09-22 [1]
## pkgload 1.0.2 2018-10-29 [1]
## plyr 1.8.6 2020-03-03 [1]
## png 0.1-7 2013-12-03 [1]
## preprocessCore 1.48.0 2019-10-29 [1]
## prettyunits 1.1.1 2020-01-24 [1]
## processx 3.4.2 2020-02-09 [1]
## progress 1.2.2 2019-05-16 [1]
## promises 1.1.0 2019-10-04 [1]
## ProtGenerics 1.18.0 2019-10-29 [1]
## ps 1.3.2 2020-02-13 [1]
## purrr 0.3.4 2020-04-17 [1]
## quadprog 1.5-8 2019-11-20 [1]
## quantro * 1.20.0 2019-10-29 [1]
## R.methodsS3 1.8.0 2020-02-14 [1]
## R.oo 1.23.0 2019-11-03 [1]
## R.utils 2.9.2 2019-12-08 [1]
## R6 2.4.1 2019-11-12 [1]
## randomForest 4.6-14 2018-03-25 [1]
## rappdirs 0.3.1 2016-03-28 [1]
## RColorBrewer 1.1-2 2014-12-07 [1]
## Rcpp 1.0.4.6 2020-04-09 [1]
## RCurl 1.98-1.2 2020-04-18 [1]
## readr 1.3.1 2018-12-21 [1]
## readxl 1.3.1 2019-03-13 [1]
## remotes 2.1.1 2020-02-15 [1]
## reshape 0.8.8 2018-10-23 [1]
## reshape2 * 1.4.4 2020-04-09 [1]
## rhdf5 2.30.1 2019-11-26 [1]
## Rhdf5lib 1.8.0 2019-10-29 [1]
## rlang 0.4.5 2020-03-01 [1]
## rmarkdown 2.1 2020-01-20 [1]
## rngtools 1.5 2020-01-23 [1]
## ROC * 1.62.0 2019-10-29 [1]
## rpart 4.1-15 2019-04-12 [1]
## RPMM * 1.25 2017-02-28 [1]
## rprojroot 1.3-2 2018-01-03 [1]
## Rsamtools 2.2.3 2020-02-23 [1]
## RSQLite 2.2.0 2020-01-07 [1]
## rstudioapi 0.11 2020-02-07 [1]
## rtracklayer 1.46.0 2019-10-29 [1]
## ruv 0.9.7.1 2019-08-30 [1]
## S4Vectors * 0.24.4 2020-04-09 [1]
## scales * 1.1.0 2019-11-18 [1]
## scrime 1.3.5 2018-12-01 [1]
## sesame * 1.4.0 2019-10-29 [1]
## sesameData * 1.4.0 2019-11-05 [1]
## sessioninfo 1.1.1 2018-11-05 [1]
## shiny 1.4.0.2 2020-03-13 [1]
## siggenes 1.60.0 2019-10-29 [1]
## sm * 2.2-5.6 2018-09-27 [1]
## statmod 1.4.34 2020-02-17 [1]
## stringi 1.4.6 2020-02-17 [1]
## stringr 1.4.0 2019-02-10 [1]
## SummarizedExperiment * 1.16.1 2019-12-19 [1]
## survival 3.1-12 2020-04-10 [1]
## testthat 2.3.2 2020-03-02 [1]
## tibble 3.0.1 2020-04-20 [1]
## tidyr 1.0.2 2020-01-24 [1]
## tidyselect 1.0.0 2020-01-27 [1]
## TxDb.Hsapiens.UCSC.hg19.knownGene * 3.2.2 2020-03-18 [1]
## usethis 1.6.0 2020-04-09 [1]
## VariantAnnotation 1.32.0 2019-10-29 [1]
## vctrs 0.2.4 2020-03-10 [1]
## wateRmelon * 1.30.0 2019-10-29 [1]
## wheatmap 0.1.0 2018-03-15 [1]
## withr 2.2.0 2020-04-20 [1]
## xfun 0.13 2020-04-13 [1]
## XML 3.99-0.3 2020-01-20 [1]
## xml2 1.3.2 2020-04-23 [1]
## xtable 1.8-4 2019-04-21 [1]
## XVector * 0.26.0 2019-10-29 [1]
## yaml 2.2.1 2020-02-01 [1]
## zlibbioc 1.32.0 2019-10-29 [1]
## source
## CRAN (R 3.6.0)
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## Bioconductor
## Bioconductor
## CRAN (R 3.6.0)
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## Bioconductor
## Bioconductor
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## Bioconductor
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## Bioconductor
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## Bioconductor
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## Bioconductor
## Bioconductor
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## Bioconductor
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.2)
## Bioconductor
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## CRAN (R 3.6.2)
## CRAN (R 3.6.0)
## Bioconductor
## CRAN (R 3.6.0)
## Bioconductor
##
## [1] /Library/Frameworks/R.framework/Versions/3.6/Resources/library